import os
import pandas as pd
import csv
import numpy as np
import math
from matplotlib.lines import Line2D
import matplotlib as mpl
from matplotlib.patches import Patch
import matplotlib.pyplot as plt
import random
from scipy.spatial import distance
from scipy import stats
from sklearn import manifold
from sklearn.decomposition import PCA
import statsmodels.stats.multitest as sm
This file includes the results of a microbiome analysis performed on samples taken from four individuals that were originally used to determine the “Impact of DNA source on genetic variant detection from human whole-genome sequencing data”.
This included blood, saliva and buccal samples taken from four individuals (blood samples were taken at a different time than saliva and buccal samples). Additionally, a methylation-based enrichment for eukaryotic DNA was performed on the saliva and buccal samples.
Sections not completed:
- Absolute number of reads between different reference databases
- The intersection between different reference databases
- Functional profiling using HUMAnN2 (if I want to look more into the families that are differentially abundant)
- HUMAnN3 taxonomy
- Functional profiling using HUMAnN3
- Functional profiling using MEGAHIT and anvi’o
- Some kind of functional richness?
Fastq.gz files were downloaded from the ENA database, project accession number PRJNA523344
Kneaddata was used for quality control and removal of human sequences. This included:
- Trimmomatic 0.39: “SLIDINGWINDOW:4:20 MINLEN:50”
- Bowtie2 with the GRCh38_PhiX database (to remove human and PhiX reads): “–fast –dovetail”
parallel -j 2 --link 'kneaddata -i {1} -i {2} -o kneaddata_out/ \
-db /home/shared/bowtiedb/GRCh38_PhiX --trimmomatic /home/robyn/Trimmomatic-0.39/ \
-t 40 --trimmomatic-options "SLIDINGWINDOW:4:20 MINLEN:50" \
--bowtie2-options "--fast --dovetail" --remove-intermediate-output' \
::: raw_data/*_1.fastq.gz ::: raw_data/*_2.fastq.gz
mkdir kneaddata_out/contam_seq
mv kneaddata_out/*_contam*.fastq kneaddata_out/contam_seq
kneaddata_read_count_table --input kneaddata_out --output kneaddata_read_counts.txt
concat_paired_end.pl -p 4 --no_R_match -o cat_reads kneaddata_out/*_paired_*.fastq
#3
#set up colors function (to get up to 120 colors, but with up to 40 unique colors)
def get_cols(num):
colormap_20, colormap_40b, colormap_40c = mpl.cm.get_cmap('tab20', 256), mpl.cm.get_cmap('tab20b', 256), mpl.cm.get_cmap('tab20c', 256)
norm, norm2 = mpl.colors.Normalize(vmin=0, vmax=19), mpl.colors.Normalize(vmin=20, vmax=39)
m1, m2, m3 = mpl.cm.ScalarMappable(norm=norm, cmap=colormap_20), mpl.cm.ScalarMappable(norm=norm, cmap=colormap_40b), mpl.cm.ScalarMappable(norm=norm2, cmap=colormap_40c)
colors_20 = [m1.to_rgba(a) for a in range(20)]
colors_40 = [m2.to_rgba(a) for a in range(20)]+[m3.to_rgba(a) for a in range(20,40)]
if num < 21: return colors_20
elif num < 41: return colors_40
else: return colors_40+colors_40+colors_40
#and colors and shapes for different participants and body sites
colors_dict, shapes_dict = {'Blood':'#900C3F', 'Saliva':'#016F85', 'Buccal':'#ff8300', 'Saliva_euk':'#02aed1', 'Buccal_euk':'#FFC300'}, {'Huref':'o', 'PGPC-0002':'^', 'PGPC-0005':'*', 'PGPC-0006':'s', 'PGPC-0050':'p'}
#4
#get numbers of reads for different steps
reads = pd.read_csv('/Users/robynwright/Documents/OneDrive/Github/Human_metagenome/read_counts.txt', sep='\t', index_col=3, header=0)
participant_dict, site_dict, full_name_dict = {}, {}, {}
samples = list(reads.index.values)
for s in samples:
participant_dict[s] = reads.loc[s, 'Participant']
site_dict[s] = reads.loc[s, 'Body site']
full_name_dict[s] = reads.loc[s, 'Participant']+' '+reads.loc[s, 'Body site']
total_reads = pd.DataFrame(reads.loc[:, 'cat_reads'])
sample_names = [participant_dict[name]+' '+site_dict[name] for name in samples]
colors = [colors_dict[s] for s in list(reads.loc[:, 'Body site'].values)]
shapes = [shapes_dict[s] for s in list(reads.loc[:, 'Participant'].values)]
#5
plt.figure(figsize=(10, 5))
ax1, ax2 = plt.subplot(121), plt.subplot(122)
plt.sca(ax1)
plt.bar(list(reads.index.values), reads.loc[:, 'Percentage'].values, color=colors, edgecolor='k')
plt.xticks(list(reads.index.values), sample_names, rotation=90)
plt.ylabel('Reads kept (%)')
plt.xlim([-0.5,20.5])
plt.sca(ax2)
plt.bar(list(reads.index.values), reads.loc[:, 'Percentage'].values, color=colors, edgecolor='k')
plt.semilogy()
plt.xticks(list(reads.index.values), sample_names, rotation=90)
plt.ylabel('Log reads kept (%)')
handles = [Patch(facecolor=colors_dict[color], edgecolor='k', label=color) for color in colors_dict]
ax2.legend(handles=handles, bbox_to_anchor=(1.4,1.05))
plt.xlim([-0.5,20.5])
plt.tight_layout()
plt.show()
#6
plt.figure(figsize=(10, 5))
ax1, ax2 = plt.subplot(121), plt.subplot(122)
plt.sca(ax1)
plt.bar(list(reads.index.values), reads.loc[:, 'cat_reads'].values, color=colors, edgecolor='k')
plt.xticks(list(reads.index.values), sample_names, rotation=90)
plt.xlim([-0.5,20.5])
plt.ylabel('Reads remaining')
plt.sca(ax2)
plt.bar(list(reads.index.values), reads.loc[:, 'cat_reads'].values, color=colors, edgecolor='k')
plt.semilogy()
plt.xticks(list(reads.index.values), sample_names, rotation=90)
plt.ylabel('Log reads remaining')
handles = [Patch(facecolor=colors_dict[color], edgecolor='k', label=color) for color in colors_dict]
ax2.legend(handles=handles, bbox_to_anchor=(1.4,1.05))
plt.xlim([-0.5,20.5])
plt.tight_layout()
plt.show()
#7
reads_remain = reads.loc[:, ['Percentage', 'cat_reads']].rename(index=full_name_dict, columns = {'cat_reads':'Number'})
#8
py$reads_remain %>%
kable() %>%
kable_styling()
| Percentage | Number | |
|---|---|---|
| Huref Blood | 0.2319067 | 1087311 |
| PGPC-0002 Blood | 0.5748318 | 3321664 |
| PGPC-0005 Blood | 0.5149286 | 1903482 |
| PGPC-0006 Blood | 0.7146067 | 3593441 |
| PGPC-0050 Blood | 0.6870164 | 2940390 |
| PGPC-0002 Saliva | 3.9289510 | 16561343 |
| PGPC-0005 Saliva | 55.6458826 | 244058333 |
| PGPC-0006 Saliva | 13.1608121 | 60497393 |
| PGPC-0050 Saliva | 56.3700720 | 257049004 |
| PGPC-0002 Buccal | 2.8112226 | 11462781 |
| PGPC-0005 Buccal | 2.8202504 | 12751614 |
| PGPC-0006 Buccal | 5.1521560 | 22434503 |
| PGPC-0050 Buccal | 2.4297107 | 10667011 |
| PGPC-0002 Saliva_euk | 1.2333408 | 5345888 |
| PGPC-0005 Saliva_euk | 8.3104464 | 38138382 |
| PGPC-0006 Saliva_euk | 1.9067564 | 8786635 |
| PGPC-0050 Saliva_euk | 5.6187272 | 25037552 |
| PGPC-0002 Buccal_euk | 0.9385856 | 4380036 |
| PGPC-0005 Buccal_euk | 1.2119820 | 5584415 |
| PGPC-0006 Buccal_euk | 1.5567291 | 7232280 |
| PGPC-0050 Buccal_euk | 1.3836141 | 6382288 |
The taxonomy has been profiled using:
1. HUMAnN
- HUMAnN2 and MetaPhlAn2
- HUMAnN3 and MetaPhlAn3
2. Kraken2 with Bracken
- GTDB (no confidence parameter set) -
using the database constructed using Struo, release 89 - GTDB (confidence = 0.1)
- Minikraken v1 (no human genome, no confidence parameter set)
- Minikraken v1 (no human genome, confidence = 0.1)
- Minikraken v2 (with human genome, no confidence parameter set)
- Minikraken v2 (with human genome, confidence = 0.1)
- RefSeq Complete v93 (no confidence parameter set)
- RefSeq Complete v93 (confidence = 0.1)
Commands run:
humann2_databases --download chocophlan full humann_database/
humann2_databases --download uniref uniref90_diamond humann_database/
# Do this for each of uniref_90 and uniref_50
parallel -j 4 'humann2 --threads 10 --input {} --output humann2_out_90/{/.} --protein-database humann_database/uniref_90/' ::: cat_reads/*fastq
mkdir humann2_log_90
python
import os
samples = os.listdir('humann2_out_90')
for sample in samples:
os.system('cp humann2_out_90/'+sample+'/'+sample+'_humann2_temp/'+sample+'.log humann2_log_90/')
quit()
mkdir humann2_final_out_90
humann2_join_tables -s --input humann2_out_90/ --file_name pathabundance --output humann2_final_out_90/humann2_pathabundance.tsv
humann2_join_tables -s --input humann2_out_90/ --file_name pathcoverage --output humann2_final_out_90/humann2_pathcoverage.tsv
humann2_join_tables -s --input humann2_out_90/ --file_name genefamilies --output humann2_final_out_90/humann2_genefamilies.tsv
mkdir humann2_final_out_90/bugs_lists/
python
import os
samples = os.listdir('humann2_out_90')
for sample in samples:
os.system('cp humann2_out_90/'+sample+'/'+sample+'_humann2_temp/'+sample+'_metaphlan_bugs_list.tsv humann2_final_out_90/bugs_lists/')
quit()
merge_metaphlan_tables.py humann2_final_out_90/bugs_lists/*tsv > humann2_final_out_90/metaphlan_merged.tsv
rm -r humann2_final_out_90/bugs_lists
cd humann2_final_out_90/
humann2_renorm_table --input humann2_pathabundance.tsv --units relab --output humann2_pathabundance_relab.tsv
humann2_split_stratified_table --input humann2_pathabundance_relab.tsv --output ./
humann2_renorm_table --input humann2_genefamilies.tsv --units relab --output humann2_genefamilies_relab.tsv
humann2_split_stratified_table --input humann2_genefamilies_relab.tsv --output ./
pip install humann
pip install metaphlan
humann_databases --download chocophlan full humann_databases/humann3_databases/ --update-config yes
humann_databases --download uniref uniref90_diamond humann_databases/humann3_databases/ --update-config yes
humann_databases --download uniref uniref50_diamond humann_databases/humann3_databases/ --update-config yes
humann_databases --download utility_mapping full humann_databases/humann3_databases/ --update-config yes
wget https://github.com/bbuchfink/diamond/releases/download/v0.9.24/diamond-linux64.tar.gz
tar xvf diamond-linux64.tar.gz
parallel -j 1 'humann --input {} --output human_metagenome/humann3_out/ --threads 12' ::: human_metagenome/cat_reads/*.fastq
merge_metaphlan_tables.py humann_final_out/bugs_lists/*tsv > humann3_final_out/metaphlan_merged.tsv
rm -r humann3_final_out/bugs_lists
humann_join_tables -s --input humann3_out/ --file_name pathabundance --output humann3_final_out/humann3_pathabundance.tsv
humann_join_tables -s --input humann3_out/ --file_name pathcoverage --output humann3_final_out/humann3_pathcoverage.tsv
humann_join_tables -s --input humann3_out/ --file_name genefamilies --output humann3_final_out/humann3_genefamilies.tsv
humann_renorm_table --input humann3_pathabundance.tsv --units relab --output humann3_pathabundance_relab.tsv
humann_split_stratified_table --input humann3_pathabundance_relab.tsv --output ./
humann_renorm_table --input humann3_genefamilies.tsv --units relab --output humann3_genefamilies_relab.tsv
humann_split_stratified_table --input humann3_genefamilies_relab.tsv --output ./
find . -name \*aligned.sam -type f -delete
find . -name \*unaligned.fa -type f -delete
find . -name \*aligned.tsv -type f -delete
humann_renorm_table --input humann3_genefamilies.tsv --output humann3_genefamilies_cpm.tsv --units cpm --update-snames
humann_regroup_table --input humann3_genefamilies_cpm.tsv --output humann3_genefamilies_cpm_ko_50.tsv --groups uniref50_ko
humann_regroup_table --input humann3_genefamilies_cpm.tsv --output humann3_genefamilies_cpm_ko_90.tsv --groups uniref90_ko
#GTDB
wget http://ftp.tue.mpg.de/ebio/projects/struo/GTDB_release89/kraken2/
#RefSeq complete
sudo mount -t ramfs none /scratch/ramdisk/
sudo cp -a $DBNAME /ramdisk
sudo cp -a /home/shared/Kraken2.0.8_Bracken150mer_RefSeqCompleteV93/ /scratch/ramdisk/
#For each database (with and without adding --confidence 0.1:
parallel -j 1 'kraken2 --use-names --threads 10 --db gtdb/kraken2_struo/ --fastq-input {} --report kraken2_out/{/.}.report > kraken2_out/{/.}.kraken' ::: human_metagenome/cat_reads/*.fastq
parallel -j 1 'bracken -d gtdb/kraken2_struo/ -i {} -l S -o {.}.bracken' ::: kraken2_report/*.kreport
#9
#get the taxonomy file and sort it to strain and genus level
taxa = pd.read_csv('/Users/robynwright/Documents/OneDrive/Github/Human_metagenome/metaphlan_humann/processing/humann2_final_out_90/metaphlan_merged.tsv', sep='\t', header=0, index_col=0)
tax_names = list(taxa.index.values)
keeping = []
for a in range(len(tax_names)):
if 't__' in tax_names[a]:
keeping.append(True)
elif 'unclassified' in tax_names[a]:
keeping.append(True)
else:
keeping.append(False)
strain = taxa.loc[keeping, :]
strain_names = list(strain.index.values)
strain_dict = {}
for i in range(len(strain_names)):
strain_dict[strain_names[i]] = strain_names[i].split('|s__')[0].split('|g__')[1]
genus = strain.rename(index=strain_dict)
genus = genus.groupby(by=genus.index, axis=0).sum()
strain_t = strain.transpose()
genus_t = genus.transpose()
#10
#define the function that calculates the nmds plots
def transform_for_NMDS(df, dist_met='braycurtis'):
X = df.iloc[0:].values
y = df.iloc[:,0].values
seed = np.random.RandomState(seed=3)
X_true = X
similarities = distance.cdist(X_true, X_true, dist_met)
mds = manifold.MDS(n_components=2, max_iter=3000, eps=1e-9, random_state=seed,
dissimilarity="precomputed", n_jobs=1)
#print(similarities)
pos = mds.fit(similarities).embedding_
nmds = manifold.MDS(n_components=2, metric=False, max_iter=3000, eps=1e-12,
dissimilarity="precomputed", random_state=seed, n_jobs=1,
n_init=1)
npos = nmds.fit_transform(similarities, init=pos)
# Rescale the data
pos *= np.sqrt((X_true ** 2).sum()) / np.sqrt((pos ** 2).sum())
npos *= np.sqrt((X_true ** 2).sum()) / np.sqrt((npos ** 2).sum())
# Rotate the data
clf = PCA()
X_true = clf.fit_transform(X_true)
pos = clf.fit_transform(pos)
npos = clf.fit_transform(npos)
return pos, npos, nmds.stress_
handles1 = [Patch(facecolor=colors_dict[color], edgecolor='k', label=color) for color in colors_dict]
handles2 = [Line2D([0], [0], marker=shapes_dict[shape], color='w', label=shape, markerfacecolor='k', markersize=15) for shape in shapes_dict]
#11
strain_pos, strain_npos, strain_stress = transform_for_NMDS(strain_t, 'braycurtis')
genus_pos, genus_npos, genus_stress = transform_for_NMDS(genus_t, 'braycurtis')
plt.figure(figsize=(10,5))
ax1, ax2 = plt.subplot(121), plt.subplot(122)
for a in range(len(strain_npos)):
ax1.scatter(strain_npos[a,0], strain_npos[a,1], marker=shapes[a], color=colors[a], s=100)
ax2.scatter(genus_npos[a,0], genus_npos[a,1], marker=shapes[a], color=colors[a], s=100)
ax1.set_xlabel('nMDS1')
ax2.set_xlabel('nMDS1')
ax1.set_ylabel('nMDS2')
ax1.set_title('Strain')
ax2.set_title('Genus')
ax2.legend(handles=handles1+handles2, bbox_to_anchor=(1,1))
plt.tight_layout()
plt.show()
#12
strain_pos, strain_npos, strain_stress = transform_for_NMDS(strain_t, 'euclidean')
genus_pos, genus_npos, genus_stress = transform_for_NMDS(genus_t, 'euclidean')
plt.figure(figsize=(10,5))
ax1, ax2 = plt.subplot(121), plt.subplot(122)
for a in range(len(strain_npos)):
ax1.scatter(strain_npos[a,0], strain_npos[a,1], marker=shapes[a], color=colors[a], s=100)
ax2.scatter(genus_npos[a,0], genus_npos[a,1], marker=shapes[a], color=colors[a], s=100)
ax1.set_xlabel('nMDS1')
ax2.set_xlabel('nMDS1')
ax1.set_ylabel('nMDS2')
ax1.set_title('Strain')
ax2.set_title('Genus')
ax2.legend(handles=handles1+handles2, bbox_to_anchor=(1.4,1))
plt.tight_layout()
plt.show()
#13
strain_pos, strain_npos, strain_stress = transform_for_NMDS(strain_t, 'jaccard')
genus_pos, genus_npos, genus_stress = transform_for_NMDS(genus_t, 'jaccard')
plt.figure(figsize=(10,5))
ax1, ax2 = plt.subplot(121), plt.subplot(122)
for a in range(len(strain_npos)):
ax1.scatter(strain_npos[a,0], strain_npos[a,1], marker=shapes[a], color=colors[a], s=100)
ax2.scatter(genus_npos[a,0], genus_npos[a,1], marker=shapes[a], color=colors[a], s=100)
ax1.set_xlabel('nMDS1')
ax2.set_xlabel('nMDS1')
ax1.set_ylabel('nMDS2')
ax1.set_title('Strain')
ax2.set_title('Genus')
ax2.legend(handles=handles1+handles2, bbox_to_anchor=(1.4,1))
plt.tight_layout()
plt.show()
Here the relative abundance of taxa calulated by MetaPhlAn2 are plotted at the Kingdom level for each sample.
#14
plt.figure(figsize=(7,5))
ax1 = plt.subplot(111)
plt.bar(list(taxa.columns.values), taxa.loc['k__Viruses', :].values, color='#C70039', edgecolor='k')
plt.bar(list(taxa.columns.values), taxa.loc['k__Bacteria', :].values, bottom=taxa.loc['k__Viruses', :].values, color='#026B81', edgecolor='k')
#plt.xticks(list(taxa.columns.values), sample_names, rotation=90)
empty = []
for x in range(0,21):
empty.append('')
ax1.text(x, -2, sample_names[x], color=colors[x], rotation=90, va='top', ha='center')
plt.xticks(range(0, 21), empty)
plt.xlim([-0.5,20.5])
handles = [Patch(facecolor='#C70039', edgecolor='k', label='Viruses'), Patch(facecolor='#026B81', edgecolor='k', label='Bacteria')]
plt.legend(handles=handles, bbox_to_anchor=(1,1.05))
plt.tight_layout()
plt.show()
Here the relative abundance of taxa calulated by MetaPhlAn2 are plotted at the Genus level for each sample. Genera with below 1% maximum relative abundance have been removed.
#15
genus = genus[genus.max(axis=1) > 1]
genera = list(genus.index.values)
plt.figure(figsize=(10,5))
ax1 = plt.subplot(111)
gen_colors = get_cols(len(genus.index.values))
handles = []
for g in range(len(genera)):
this_gen = genus.loc[genera[g], :].values
if g == 0:
ax1.bar(list(genus.columns.values), this_gen, color=gen_colors[g], edgecolor='k')
total = this_gen
else:
ax1.bar(list(genus.columns.values), this_gen, bottom=total, color=gen_colors[g], edgecolor='k')
total = this_gen+total
handles.append(Patch(facecolor=gen_colors[g], edgecolor='k', label=genera[g]))
empty = []
for x in range(0,21):
empty.append('')
ax1.text(x, -2, sample_names[x], color=colors[x], rotation=90, va='top', ha='center')
plt.xticks(range(0, 21), empty)
plt.xlim([-0.5,20.5])
plt.legend(handles=handles, bbox_to_anchor=(1,1.05), ncol=2)
plt.tight_layout()
plt.show()
#16
#get all samples into dataframes based on the database that they use
folders = sorted(os.listdir('/Users/robynwright/Documents/OneDrive/Github/Human_metagenome/kraken2'))
del folders[0]
kraken_columns = {0:'Percent fragments clade', 1:'Number fragments clade', 2:'Number fragments taxon', 3:'Level', 4:'NCBI ID', 5:'Taxon name'}
kraken_all_db, bracken_all_db, all_domains = [], [], {}
for fol in folders:
#if fol != 'gtdb_conf': continue
if fol == 'db_genera' or fol == 'db_genera_in_common':
continue
if not os.path.isdir('/Users/robynwright/Documents/OneDrive/Github/Human_metagenome/kraken2/'+fol):
continue
bracken, kraken, bracken_kreport = [], [], []
bracken_pd, kraken_pd = [], []
for fi in sorted(os.listdir('/Users/robynwright/Documents/OneDrive/Github/Human_metagenome/kraken2/'+fol)):
if fi[-7:] == 'bracken':
bracken.append(fi)
elif fi[-7:] == 'kreport' and 'bracken' not in fi:
kraken.append(fi)
elif fi[-7:] == 'kreport':
bracken_kreport.append(fi)
for bk in bracken_kreport:
with open('/Users/robynwright/Documents/OneDrive/Github/Human_metagenome/kraken2/'+fol+'/'+bk, 'rU') as f:
bk = []
domains = {}
this_domain, domain_name = [], ''
for row in csv.reader(f, delimiter='\t'):
bk.append(row)
row[5] = row[5].lstrip()
if row[3] == 'D':
if domain_name != '':
domains[domain_name] = this_domain
this_domain, domain_name = [], row[5]
else:
if row[3] != 'R' and row[3] != 'U' and 'D' not in row[3]:
this_domain.append(row[5])
domains[domain_name] = this_domain
for domain in domains:
if domain in all_domains:
all_domains[domain] = list(set(all_domains[domain]+domains[domain]))
else:
all_domains[domain] = list(set(domains[domain]))
for b in bracken:
if len(b) > 22:
continue
sample = pd.read_csv('/Users/robynwright/Documents/OneDrive/Github/Human_metagenome/kraken2/'+fol+'/'+b, sep='\t', header=0, index_col=0)
b = b.replace('_150', '')
sample.drop(['taxonomy_id', 'taxonomy_lvl', 'kraken_assigned_reads', 'added_reads', 'fraction_total_reads'], axis=1, inplace=True)
sample.rename(columns={'new_est_reads':b[:-8]}, inplace=True)
bracken_pd.append(sample)
for k in kraken:
sample = pd.read_csv('/Users/robynwright/Documents/OneDrive/Github/Human_metagenome/kraken2/'+fol+'/'+k, sep='\t', header=None, index_col=3)
sample = sample.loc[['U', 'D'], :]
if sample.iloc[1,1] == 'x':
sample.iloc[1,1] = sample.iloc[1,2]
sample = sample.rename(columns=kraken_columns).drop(['Number fragments taxon', 'NCBI ID'], axis=1).rename(columns={'Percent fragments clade':k[:-8]+'_percent', 'Number fragments clade':k[:-8]+'_reads'}).set_index('Taxon name')
taxa = list(sample.index.values)
taxa_dict = {}
for t in taxa:
taxa_dict[t] = t.replace(' ', '')
sample = sample.rename(index=taxa_dict)
kraken_pd.append(sample)
bracken = pd.concat(bracken_pd, join='outer')
kraken = pd.concat(kraken_pd, join='outer')
kraken.to_csv('/Users/robynwright/Documents/OneDrive/Github/Human_metagenome/kraken2/gtdb_conf_test.csv')
kraken = kraken.rename(index={'d__Bacteria':'Bacteria', 'd__Archaea':'Archaea'})
kraken = kraken.groupby(by=kraken.index, axis=0).sum()
bracken = bracken.groupby(by=bracken.index, axis=0).sum().fillna(value=0)
kraken_all_db.append(kraken), bracken_all_db.append(bracken)
#17
x1 = [x for x in range(21)]
x2 = [x+0.3 for x in range(21)]
tax_plotting = ['Archaea', 'Bacteria', 'Eukaryota', 'Viruses', 'unclassified']
color_plotting = ['#EDBB99', '#5499C7', '#7DCEA0', '#F7DC6F', '#CCD1D1']
tax_paper = ['Bacteria', 'Eukaryota', 'Other', 'Unclassified']
color_paper = ['#5499C7', '#7DCEA0', '#CD6155', '#CCD1D1']
from_paper = pd.read_csv('/Users/robynwright/Documents/OneDrive/Github/Human_metagenome/kraken2/from_paper.csv', header=0, index_col=0)
def get_summary_reads(kraken_db):
fig = plt.figure(figsize=(15,15))
ax1, ax2, ax3, ax4, ax5 = plt.subplot(321), plt.subplot(322), plt.subplot(323), plt.subplot(324), plt.subplot(325)
ax1.set_title('Minikraken V1\n(without human genome)'), ax2.set_title('Minikraken V2\n(with human genome)'), ax3.set_title('GTDB'), ax4.set_title('RefSeq Complete V93')
ax5.set_title('From paper')
ax_plot = [ax3, ax3, ax1, ax1, ax2, ax2, ax4, ax4]
x_plot = [x1, x2, x1, x2, x1, x2, x1, x2]
axs = [ax1, ax2, ax3, ax4, ax5]
for s in range(len(samples)):
if s == 0:
continue
bottom = 0
for t in range(len(tax_paper)):
ax5.bar(x1[s], from_paper.loc[tax_paper[t], samples[s]], bottom=bottom, color=color_paper[t], edgecolor='k', width=0.6)
bottom += from_paper.loc[tax_paper[t], samples[s]]
for db in range(len(kraken_db)):
ax_using = ax_plot[db]
x = x_plot[db]
db = kraken_db[db]
handles = []
for tax in range(len(tax_plotting)):
handles.append(Patch(facecolor=color_plotting[tax], edgecolor='k', label=tax_plotting[tax]))
tax = tax_plotting[tax]
if tax not in list(db.index.values):
db.loc[tax] = [0 for i in range(db.shape[1])]
handles.append(Patch(facecolor=color_paper[2], edgecolor='k', label='Other'))
db = db.fillna(value=0)
for s in range(len(samples)):
bottom = 0
for t in range(len(tax_plotting)):
prop = db.loc[tax_plotting[t], samples[s]+'_reads']
cat = total_reads.loc[samples[s], 'cat_reads']
prop = (prop/cat)*100
ax_using.bar(x[s], prop, bottom=bottom, color=color_plotting[t], edgecolor='k', width=0.3)
bottom += prop
ax2.legend(handles=handles, bbox_to_anchor=(1,1.05))
for ax in axs:
plt.sca(ax)
plt.xticks(x1, ['' for x in x1])
plt.ylim([0, 100])
plt.xlim([-0.5, 20.5])
for x in x1:
ax5.text(x, -2, sample_names[x], color=colors[x], rotation=90, va='top', ha='center')
ax4.text(x, -2, sample_names[x], color=colors[x], rotation=90, va='top', ha='center')
ax1.set_ylabel('Classified (%)'), ax3.set_ylabel('Classified (%)'), ax5.set_ylabel('Classified(%)')
#plt.tight_layout()
return
def get_summary_bacteria(kraken_db):
tax_plotting = ['Bacteria']
alpha = ['#5499C7', '#F1C40F', '#5499C7', '#F1C40F', '#5499C7', '#F1C40F', '#5499C7', '#F1C40F']
fig = plt.figure(figsize=(10,8))
ax1, ax2, ax3, ax4 = plt.subplot(221), plt.subplot(222), plt.subplot(223), plt.subplot(224)
ax1.set_title('Minikraken V1\n(without human genome)'), ax2.set_title('Minikraken V2\n(with human genome)'), ax3.set_title('GTDB'), ax4.set_title('RefSeq Complete V93')
ax_plot = [ax3, ax3, ax1, ax1, ax2, ax2, ax4, ax4]
x_plot = [x1, x2, x1, x2, x1, x2, x1, x2]
axs = [ax1, ax2, ax3, ax4]
for db in range(len(kraken_db)):
ax_using = ax_plot[db]
x = x_plot[db]
alp = alpha[db]
db = kraken_db[db]
for tax in range(len(tax_plotting)):
tax = tax_plotting[tax]
if tax not in list(db.index.values):
db.loc[tax] = [0 for i in range(db.shape[1])]
db = db.fillna(value=0)
for s in range(len(samples)):
bottom = 0
for t in range(len(tax_plotting)):
prop = db.loc[tax_plotting[t], samples[s]+'_reads']
cat = total_reads.loc[samples[s], 'cat_reads']
ax_using.bar(x[s], prop, bottom=bottom, color=alp, edgecolor='k', width=0.3)
bottom += prop
handles = []
handles.append(Patch(facecolor=alpha[0], edgecolor='k', label='No confidence value'))
handles.append(Patch(facecolor=alpha[1], edgecolor='k', label='Confidence=0.1'))
ax2.legend(handles=handles, bbox_to_anchor=(1.6,1.03))
for ax in axs:
plt.sca(ax)
plt.xticks(x1, ['' for x in x1])
plt.semilogy()
plt.xlim([-0.5, 20.5])
#plt.ylim([0, 100])
plt.xlim([-0.5, 20.5])
for x in x1:
pl = ((1/21)*(x+1))-0.02
ax3.text(pl, -0.03, sample_names[x], color=colors[x], rotation=90, va='top', ha='center', transform=ax3.transAxes)
ax4.text(pl, -0.03, sample_names[x], color=colors[x], rotation=90, va='top', ha='center', transform=ax4.transAxes)
ax1.set_ylabel('Number of reads'), ax3.set_ylabel('Number of reads')
#plt.tight_layout()
return
A summary of the percentage of reads classified as different domains with different databases. Note that the ‘From paper’ plot uses the classifications given in the original paper, where 10,000 unmapped reads were classified using BLAST searches of the NCBI database.
#18
get_summary_reads(kraken_all_db)
plt.show()
Summary of the number of reads that are classified as bacteria by each database.
#19
get_summary_bacteria(kraken_all_db)
plt.tight_layout()
plt.show()
These first plots are all separately with the confidence parameter set. See the last tab for those without the confidence parameter set.
#20
db_names = ['gtdb', 'gtdb_conf', 'minikraken', 'minikraken_conf', 'minikraken_human', 'minikraken_human_conf', 'refseq', 'refseq_conf']
bacteria = all_domains['Bacteria']+all_domains['d__Bacteria']
genera, gen_names, genera_1, gen_names_1, strain, gen_sums = [], [], [], [], [], []
for db in range(len(bracken_all_db)):
d = int(db)
db = bracken_all_db[db]
species = list(db.index.values)
keeping = []
species_dict = {}
for sp in species:
if sp in bacteria:
keeping.append(True)
new_sp = sp.split('__')
if len(new_sp) > 1:
new_sp = new_sp[1]
else:
new_sp = new_sp[0]
species_dict[sp] = new_sp.split(' ')[0].replace("'", '')
else:
keeping.append(False)
in_len = db.shape[0]
db = db.loc[keeping, :]
strain.append(db)
db = db.rename(index=species_dict)
db = db.groupby(by=db.index, axis=0).sum()
sums = db.sum(axis=0)
gen_sums.append(sums)
db = db.divide(sums, axis=1).multiply(100)
genera.append(db)
db.to_csv('/Users/robynwright/Documents/OneDrive/Github/Human_metagenome/kraken2/db_genera/'+db_names[d]+'.csv')
gen_names = gen_names+list(db.index.values)
db = db[db.max(axis=1) > 1]
genera_1.append(db)
gen_names_1 = gen_names_1+list(db.index.values)
gen_names = list(set(gen_names))
gen_names_1 = list(set(gen_names_1))
#21
def plot_four_nmds(dbs, metric, name):
fig = plt.figure(figsize=(15,10))
#fig.suptitle(name+metric+'\n\n\n')
ax1, ax2, ax3, ax4 = plt.subplot(221), plt.subplot(222), plt.subplot(223), plt.subplot(224)
axs = [ax3, ax1, ax2, ax4]
ax1.set_title('Minikraken V1\n(without human genome)'), ax2.set_title('Minikraken V2\n(with human genome)'), ax3.set_title('GTDB'), ax4.set_title('RefSeq Complete V93')
for db in range(len(dbs)):
n = db
db = dbs[db].transpose()
pos, npos, stress = transform_for_NMDS(db, metric)
for a in range(len(npos)):
axs[n].scatter(npos[a,0], npos[a,1], marker=shapes[a], color=colors[a], s=100, edgecolor='k')
axs[n].set_xlabel('nMDS1')
axs[n].set_ylabel('nMDS2')
handles1 = [Patch(facecolor=colors_dict[color], edgecolor='k', label=color) for color in colors_dict]
handles2 = [Line2D([0], [0], marker=shapes_dict[shape], color='w', label=shape, markerfacecolor='k', markersize=15) for shape in shapes_dict]
ax2.legend(handles=handles1+handles2, bbox_to_anchor=(1,1))
plt.tight_layout()
return
#22
plot_four_nmds([strain[1], strain[3], strain[5], strain[7]], 'braycurtis', 'NMDS confidence=0.1 strain ')
plt.show()
#23
plot_four_nmds([genera[1], genera[3], genera[5], genera[7]], 'braycurtis', 'NMDS confidence=0.1 genera ')
plt.show()
#24
plot_four_nmds([strain[1], strain[3], strain[5], strain[7]], 'euclidean', 'NMDS confidence=0.1 strain ')
plt.show()
#25
plot_four_nmds([genera[1], genera[3], genera[5], genera[7]], 'euclidean', 'NMDS confidence=0.1 genera ')
plt.show()
#26
plot_four_nmds([strain[1], strain[3], strain[5], strain[7]], 'jaccard', 'NMDS confidence=0.1 strain ')
plt.show()
#27
plot_four_nmds([genera[1], genera[3], genera[5], genera[7]], 'jaccard', 'NMDS confidence=0.1 genera ')
plt.show()
Bray-curtis distance at strain level
#28
plot_four_nmds([strain[0], strain[2], strain[4], strain[6]], 'braycurtis', 'NMDS no confidence strain ')
plt.show()
Bray-curtis distance at genus level
#29
plot_four_nmds([genera[0], genera[2], genera[4], genera[6]], 'braycurtis', 'NMDS no confidence genera ')
plt.show()
Euclidean distance at strain level
#30
plot_four_nmds([strain[0], strain[2], strain[4], strain[6]], 'euclidean', 'NMDS no confidence strain ')
plt.show()
Euclidean distance at genus level
#31
plot_four_nmds([genera[0], genera[2], genera[4], genera[6]], 'euclidean', 'NMDS no confidence genera ')
plt.show()
Jaccard distance at strain level
#32
plot_four_nmds([strain[0], strain[2], strain[4], strain[6]], 'jaccard', 'NMDS no confidence strain ')
plt.show()
Jaccard distance at genus level
#33
plot_four_nmds([genera[0], genera[2], genera[4], genera[6]], 'jaccard', 'NMDS no confidence genera ')
plt.show()
These plots are now only calculated for the classifications that used confidence = 0.1. Genera with below 1% maximum relative abundance are removed and the numbers in brackets are the number of reads that were classified as bacteria.
#34
db_names = ['gtdb', 'gtdb_conf', 'minikraken', 'minikraken_conf', 'minikraken_human', 'minikraken_human_conf', 'refseq', 'refseq_conf']
#bacteria = all_domains['Bacteria']+all_domains['d__Bacteria']
#genera, gen_names, genera_1, gen_names_1, strain, gen_sums = [], [], [], [], [], []
gen_names_1 = sorted(gen_names_1)
def plot_genera(db, sums, tax_cols, gen_names_1, dname):
plt.figure(figsize=(10,5))
ax1 = plt.subplot(111)
bottom = [0 for x in range(len(db.columns))]
handles = []
for g in range(len(gen_names_1)):
if gen_names_1[g] in db.index.values:
this_row = db.loc[gen_names_1[g], :].values
ax1.bar(x1, this_row, bottom=bottom, color=tax_cols[g], edgecolor='k')
handles.append(Patch(facecolor=tax_cols[g], edgecolor='k', label=gen_names_1[g]))
for b in range(len(bottom)):
bottom[b] += this_row[b]
ax1.legend(handles=handles, bbox_to_anchor=(1, 1.03), ncol=3)
plt.xticks(x1, ['' for x in x1])
plt.ylabel('Relative abundance(%)')
plt.xlim([-0.5, 20.5])
plt.ylim([0, 100])
for x in x1:
n = str(int(sums[samples[x]]))
ax1.text(x, -2, sample_names[x]+' ('+n+')', color=colors[x], rotation=90, va='top', ha='center')
plt.tight_layout()
return
gen_plot = [genera_1[1], genera_1[3], genera_1[5], genera_1[7]]
db_name = ['GTDB', 'Minikraken V1', 'Minikraken V2', 'RefSeq Complete V93']
all_sums = [gen_sums[1], gen_sums[3], gen_sums[5], gen_sums[7]]
tax_cols = get_cols(len(gen_names_1))
#35
plot_genera(gen_plot[1], all_sums[1], tax_cols, gen_names_1, db_name[1])
plt.show()
#36
plot_genera(gen_plot[2], all_sums[2], tax_cols, gen_names_1, db_name[2])
plt.show()
#37
plot_genera(gen_plot[0], all_sums[0], tax_cols, gen_names_1, db_name[0])
plt.show()
#38
plot_genera(gen_plot[3], all_sums[3], tax_cols, gen_names_1, db_name[3])
plt.show()
Here I have carried out ANCOM2 tests for differential abundance of genera between body sites. All genera are then plotted on a heatmap with mean values for each body site and stars to denote significant differences as determined by ANCOM.
While many of these results vary between databases, it is worth noting that some differences are consistent, e.g.:
1. Prevotella are always more abundant in saliva samples
2. Streptococcus are always more abundant in buccal samples
3. Klebsiella (where present) are always more abundant in blood samples
#39
def get_differences(new_genus, db_name):
new_genus = new_genus.drop(['SRR8595488'], axis=1)
new_genus = new_genus[new_genus.max(axis=1) > 0.1]
samples = list(new_genus.columns)
list_comparisons = [['Blood', 'Saliva'], ['Blood', 'Buccal'], ['Saliva', 'Buccal'], ['Saliva', 'Saliva_euk'], ['Buccal', 'Buccal_euk'], ['Blood', 'Saliva_euk'], ['Blood', 'Buccal_euk'], ['Saliva_euk', 'Buccal_euk']]
comparisons, metadata, comp_len = [], [], []
for a in range(len(list_comparisons)):
keeping = []
this_md = []
for b in range(len(samples)):
if site_dict[samples[b]] in list_comparisons[a]:
keeping.append(True)
this_md.append([samples[b], site_dict[samples[b]]])
else:
keeping.append(False)
this_comp = new_genus.loc[:, keeping]
this_comp = this_comp[this_comp.max(axis=1) > 0.1]
comparisons.append(this_comp)
this_md = pd.DataFrame(this_md, columns=['Samples', 'Groups'])
#this_md.set_index('Samples', inplace=True)
metadata.append(this_md)
comp_len.append(this_comp.shape[0])
new_genus = new_genus.rename(columns=site_dict, inplace=False)
return comparisons, metadata, new_genus, comp_len
db_name = ['GTDB', 'Minikraken V1', 'Minikraken V2', 'RefSeq Complete V93']
comparison_names = [r'Blood vs saliva', r'Blood vs buccal', r'Saliva vs buccal', r'Saliva vs saliva_euk', r'Buccal vs buccal_euk', r'Blood vs saliva_euk', r'Blood vs buccal_euk', r'Saliva_euk vs buccal_euk']
source("/Users/robynwright/Documents/OneDrive/Github/R-notebooks/ancom_v2.1.R")
get_ancom <- function(ft, md) {
all_ancom = list()
for (a in 1:8){
feature_table = ft[a]
meta_data = md[a]
process = feature_table_pre_process(feature_table, meta_data, 'Samples', 'Groups', lib_cut=10, neg_lb=TRUE)
ancom_out = ANCOM(process$feature_table, process$meta_data, process$struc_zero, main_var='Groups')
all_ancom[[a]] <- ancom_out$out
}
return(all_ancom)
}
def sort_ancom_results(r_ancom):
ancom_lists_09, ancom_lists_08, ancom_lists_07, ancom_lists_06 = [], [], [], []
for a in range(len(r_ancom)):
this_sig_09, this_sig_08, this_sig_07, this_sig_06 = [], [], [], []
r_ancom[a].set_index('taxa_id', inplace=True)
all_sp = list(r_ancom[a].index.values)
for b in range(len(all_sp)):
if r_ancom[a].loc[all_sp[b], 'detected_0.9'] == True:
this_sig_09.append(all_sp[b])
if r_ancom[a].loc[all_sp[b], 'detected_0.8'] == True:
this_sig_08.append(all_sp[b])
if r_ancom[a].loc[all_sp[b], 'detected_0.7'] == True:
this_sig_07.append(all_sp[b])
if r_ancom[a].loc[all_sp[b], 'detected_0.6'] == True:
this_sig_06.append(all_sp[b])
ancom_lists_09.append(this_sig_09), ancom_lists_08.append(this_sig_08), ancom_lists_07.append(this_sig_07), ancom_lists_06.append(this_sig_06)
return [ancom_lists_09, ancom_lists_08, ancom_lists_07, ancom_lists_06]
def plot_heatmap(new_genus, ANCOM):
print(ANCOM)
names = ['Blood', 'Saliva', 'Buccal', 'Saliva_euk', 'Buccal_euk']
other_names = ['Abundance', r'Blood $vs$ saliva', r'Blood $vs$ buccal', r'Saliva $vs$ buccal', r'Saliva $vs$ saliva_euk', r'Buccal $vs$ buccal_euk', r'Blood $vs$ saliva_euk', r'Blood $vs$ buccal_euk', r'Saliva_euk $vs$ buccal_euk']
colormap, norm = mpl.cm.get_cmap('plasma', 256), mpl.colors.Normalize(vmin=0, vmax=1)
m = mpl.cm.ScalarMappable(norm=norm, cmap=colormap)
if list(new_genus.index.values)[0][0] == 'A':
new_genus = new_genus.iloc[::-1]
figure = plt.figure(figsize=(5,new_genus.shape[0]*0.2))
ax1 = plt.subplot(111)
genus_names = list(new_genus.index.values)
y = []
for g in range(len(genus_names)):
this_row = new_genus.loc[genus_names[g], :]
values = [(np.mean(this_row[name].values)) for name in names]
bottom, top, x = [g for a in range(5)], [1 for a in range(5)], [a for a in range(5)]
y.append(g+0.5)
ma = max(values)
values = [v/ma for v in values]
values = [m.to_rgba(v) for v in values]
ax1.bar(x, top, bottom=bottom, color=values, edgecolor='k', width=1)
x.append(5)
ax1.scatter(x[-1], bottom[-1]+0.5, color='k', s=ma*5)
sig, x_plt = [], x[-1]
for a in range(len(ANCOM)):
x_plt += 0.75
if genus_names[g] in ANCOM[a]: ax1.scatter(x_plt, bottom[-1]+0.5, marker='*', color='k')
x.append(x_plt)
plt.ylim([0, len(genus_names)]), plt.xlim([-0.5, x[-1]+0.5])
plt.yticks(y, genus_names)
plt.xticks(x, names+other_names, rotation=90)
ax1.xaxis.set_ticks_position('top')
plt.tight_layout()
plt.show()
return
These differences are determined by ANCOM2 and the most conservative of these is the ANCOM 0.9 set, which is then used for plotting heatmaps.
this_genera_db = pd.read_csv('/Users/robynwright/Documents/OneDrive/Github/Human_metagenome/kraken2/db_genera/minikraken_conf.csv', header=0, index_col=0)
comparisons, md, new_genus, comp_len = get_differences(this_genera_db, db_name[1])
c1, c2, c3, c4, c5, c6, c7, c8 = comparisons[0], comparisons[1], comparisons[2], comparisons[3], comparisons[4], comparisons[5], comparisons[6], comparisons[7]
md1, md2, md3, md4, md5, md6, md7, md8 = md[0], md[1], md[2], md[3], md[4], md[5], md[6], md[7]
ft = list(py$c1, py$c2, py$c3, py$c4, py$c5, py$c6, py$c7, py$c8)
md = list(py$md1, py$md2, py$md3, py$md4, py$md5, py$md6, py$md7, py$md8)
all_ancom = get_ancom(ft, md)
ancom = sort_ancom_results(r.all_ancom)
ancom_df = []
for c in range(len(comparison_names)):
this_row = [comparison_names[c], comp_len[c], len(ancom[3][c]), len(ancom[2][c]), len(ancom[1][c]), len(ancom[0][c])]
ancom_df.append(this_row)
ancom_df = pd.DataFrame(ancom_df, columns=['Comparison', 'Shared genera', 'ANCOM 0.6', 'ANCOM 0.7', 'ANCOM 0.8', 'ANCOM 0.9'])
py$ancom_df %>%
kable() %>%
kable_styling()
| Comparison | Shared genera | ANCOM 0.6 | ANCOM 0.7 | ANCOM 0.8 | ANCOM 0.9 |
|---|---|---|---|---|---|
| Blood vs saliva | 41 | 41 | 37 | 26 | 8 |
| Blood vs buccal | 37 | 27 | 21 | 15 | 3 |
| Saliva vs buccal | 45 | 3 | 3 | 2 | 1 |
| Saliva vs saliva_euk | 41 | 0 | 0 | 0 | 0 |
| Buccal vs buccal_euk | 39 | 2 | 0 | 0 | 0 |
| Blood vs saliva_euk | 38 | 32 | 24 | 15 | 3 |
| Blood vs buccal_euk | 34 | 11 | 5 | 2 | 2 |
| Saliva_euk vs buccal_euk | 43 | 5 | 3 | 2 | 1 |
Relative abundance is scaled within each genus, with yellow being highest in abundance and purple lowest. Blobs are scaled to maximum relative abundance. Stars denote genera that are significantly differentially abundant between body sites (ANCOM2 0.9).
plot_heatmap(new_genus, ancom[0])
These differences are determined by ANCOM2 and the most conservative of these is the ANCOM 0.9 set, which is then used for plotting heatmaps.
this_genera_db = pd.read_csv('/Users/robynwright/Documents/OneDrive/Github/Human_metagenome/kraken2/db_genera/minikraken_human_conf.csv', header=0, index_col=0)
comparisons, md, new_genus, comp_len = get_differences(this_genera_db, db_name[1])
c1, c2, c3, c4, c5, c6, c7, c8 = comparisons[0], comparisons[1], comparisons[2], comparisons[3], comparisons[4], comparisons[5], comparisons[6], comparisons[7]
md1, md2, md3, md4, md5, md6, md7, md8 = md[0], md[1], md[2], md[3], md[4], md[5], md[6], md[7]
ft = list(py$c1, py$c2, py$c3, py$c4, py$c5, py$c6, py$c7, py$c8)
md = list(py$md1, py$md2, py$md3, py$md4, py$md5, py$md6, py$md7, py$md8)
all_ancom = get_ancom(ft, md)
ancom = sort_ancom_results(r.all_ancom)
ancom_df = []
for c in range(len(comparison_names)):
this_row = [comparison_names[c], comp_len[c], len(ancom[3][c]), len(ancom[2][c]), len(ancom[1][c]), len(ancom[0][c])]
ancom_df.append(this_row)
ancom_df = pd.DataFrame(ancom_df, columns=['Comparison', 'Shared genera', 'ANCOM 0.6', 'ANCOM 0.7', 'ANCOM 0.8', 'ANCOM 0.9'])
py$ancom_df %>%
kable() %>%
kable_styling()
| Comparison | Shared genera | ANCOM 0.6 | ANCOM 0.7 | ANCOM 0.8 | ANCOM 0.9 |
|---|---|---|---|---|---|
| Blood vs saliva | 49 | 48 | 47 | 38 | 9 |
| Blood vs buccal | 48 | 35 | 16 | 14 | 4 |
| Saliva vs buccal | 39 | 5 | 4 | 1 | 0 |
| Saliva vs saliva_euk | 30 | 0 | 0 | 0 | 0 |
| Buccal vs buccal_euk | 37 | 0 | 0 | 0 | 0 |
| Blood vs saliva_euk | 49 | 47 | 33 | 22 | 5 |
| Blood vs buccal_euk | 50 | 19 | 14 | 12 | 4 |
| Saliva_euk vs buccal_euk | 42 | 4 | 3 | 3 | 2 |
Relative abundance is scaled within each genus, with yellow being highest in abundance and purple lowest. Blobs are scaled to maximum relative abundance. Stars denote genera that are significantly differentially abundant between body sites (ANCOM2 0.9).
plot_heatmap(new_genus, ancom[0])
These differences are determined by ANCOM2 and the most conservative of these is the ANCOM 0.9 set, which is then used for plotting heatmaps.
this_genera_db = pd.read_csv('/Users/robynwright/Documents/OneDrive/Github/Human_metagenome/kraken2/db_genera/gtdb_conf.csv', header=0, index_col=0)
comparisons, md, new_genus, comp_len = get_differences(this_genera_db, db_name[1])
c1, c2, c3, c4, c5, c6, c7, c8 = comparisons[0], comparisons[1], comparisons[2], comparisons[3], comparisons[4], comparisons[5], comparisons[6], comparisons[7]
md1, md2, md3, md4, md5, md6, md7, md8 = md[0], md[1], md[2], md[3], md[4], md[5], md[6], md[7]
ft = list(py$c1, py$c2, py$c3, py$c4, py$c5, py$c6, py$c7, py$c8)
md = list(py$md1, py$md2, py$md3, py$md4, py$md5, py$md6, py$md7, py$md8)
all_ancom = get_ancom(ft, md)
ancom = sort_ancom_results(r.all_ancom)
ancom_df = []
for c in range(len(comparison_names)):
this_row = [comparison_names[c], comp_len[c], len(ancom[3][c]), len(ancom[2][c]), len(ancom[1][c]), len(ancom[0][c])]
ancom_df.append(this_row)
ancom_df = pd.DataFrame(ancom_df, columns=['Comparison', 'Shared genera', 'ANCOM 0.6', 'ANCOM 0.7', 'ANCOM 0.8', 'ANCOM 0.9'])
py$ancom_df %>%
kable() %>%
kable_styling()
| Comparison | Shared genera | ANCOM 0.6 | ANCOM 0.7 | ANCOM 0.8 | ANCOM 0.9 |
|---|---|---|---|---|---|
| Blood vs saliva | 98 | 88 | 82 | 32 | 6 |
| Blood vs buccal | 91 | 21 | 14 | 8 | 5 |
| Saliva vs buccal | 96 | 18 | 10 | 2 | 1 |
| Saliva vs saliva_euk | 93 | 3 | 2 | 0 | 0 |
| Buccal vs buccal_euk | 95 | 8 | 5 | 1 | 0 |
| Blood vs saliva_euk | 92 | 27 | 17 | 11 | 4 |
| Blood vs buccal_euk | 75 | 7 | 7 | 3 | 2 |
| Saliva_euk vs buccal_euk | 97 | 19 | 13 | 7 | 1 |
Relative abundance is scaled within each genus, with yellow being highest in abundance and purple lowest. Blobs are scaled to maximum relative abundance. Stars denote genera that are significantly differentially abundant between body sites (ANCOM2 0.9).
plot_heatmap(new_genus, ancom[0])
These differences are determined by ANCOM2 and the most conservative of these is the ANCOM 0.9 set, which is then used for plotting heatmaps.
this_genera_db = pd.read_csv('/Users/robynwright/Documents/OneDrive/Github/Human_metagenome/kraken2/db_genera/refseq_conf.csv', header=0, index_col=0)
comparisons, md, new_genus, comp_len = get_differences(this_genera_db, db_name[1])
c1, c2, c3, c4, c5, c6, c7, c8 = comparisons[0], comparisons[1], comparisons[2], comparisons[3], comparisons[4], comparisons[5], comparisons[6], comparisons[7]
md1, md2, md3, md4, md5, md6, md7, md8 = md[0], md[1], md[2], md[3], md[4], md[5], md[6], md[7]
ft = list(py$c1, py$c2, py$c3, py$c4, py$c5, py$c6, py$c7, py$c8)
md = list(py$md1, py$md2, py$md3, py$md4, py$md5, py$md6, py$md7, py$md8)
all_ancom = get_ancom(ft, md)
ancom = sort_ancom_results(r.all_ancom)
ancom_df = []
for c in range(len(comparison_names)):
this_row = [comparison_names[c], comp_len[c], len(ancom[3][c]), len(ancom[2][c]), len(ancom[1][c]), len(ancom[0][c])]
ancom_df.append(this_row)
ancom_df = pd.DataFrame(ancom_df, columns=['Comparison', 'Shared genera', 'ANCOM 0.6', 'ANCOM 0.7', 'ANCOM 0.8', 'ANCOM 0.9'])
py$ancom_df %>%
kable() %>%
kable_styling()
| Comparison | Shared genera | ANCOM 0.6 | ANCOM 0.7 | ANCOM 0.8 | ANCOM 0.9 |
|---|---|---|---|---|---|
| Blood vs saliva | 85 | 81 | 77 | 35 | 11 |
| Blood vs buccal | 82 | 65 | 36 | 21 | 7 |
| Saliva vs buccal | 69 | 9 | 6 | 3 | 0 |
| Saliva vs saliva_euk | 64 | 0 | 0 | 0 | 0 |
| Buccal vs buccal_euk | 76 | 4 | 4 | 1 | 0 |
| Blood vs saliva_euk | 86 | 72 | 37 | 20 | 4 |
| Blood vs buccal_euk | 79 | 18 | 15 | 8 | 2 |
| Saliva_euk vs buccal_euk | 84 | 12 | 10 | 5 | 1 |
Relative abundance is scaled within each genus, with yellow being highest in abundance and purple lowest. Blobs are scaled to maximum relative abundance. Stars denote genera that are significantly differentially abundant between body sites (ANCOM2 0.9).
plot_heatmap(new_genus, ancom[0])
Now we are looking at the taxa (grouped to genus level, to hopefully allow for differences between databases) that are in common between the different databases that have been used with Kraken2. We are also using only the versions that have confidence=0.1.
The plots below here show the number of reads in each sample before and after removal of the genera that aren’t present in all databases.
#print(bracken_all_db)
def intersection(lst1, lst2):
lst3 = [value for value in lst1 if value in lst2]
return lst3
conf_bracken_genus = []
genus_in_each = []
for a in range(len(bracken_all_db)):
if a not in [0, 2, 4, 6]:
continue
this_db = pd.DataFrame(bracken_all_db[a])
taxa = list(this_db.index.values)
tax_dict = {}
for b in range(len(taxa)):
orig_tax = taxa[b]
if 's__' in taxa[b]:
taxa[b] = taxa[b].replace('s__', '')
taxa[b] = taxa[b].split(' ')[0]
taxa[b] = taxa[b].replace("'", "")
tax_dict[orig_tax] = taxa[b]
this_db.rename(index=tax_dict, inplace=True)
genus = this_db.groupby(by=this_db.index, axis=0).sum()
conf_bracken_genus.append(genus)
genus_in_each.append(list(genus.index.values))
[print(len(gen)) for gen in genus_in_each]
overall_genus = intersection(genus_in_each[0], genus_in_each[1])
overall_genus = intersection(overall_genus, genus_in_each[2])
overall_genus = intersection(overall_genus, genus_in_each[3])
print(len(overall_genus))
fig = plt.figure(figsize=(10,6))
ax = [plt.subplot(223), plt.subplot(221), plt.subplot(222), plt.subplot(224)]
x1 = [x for x in range(21)]
x2 = [x+0.4 for x in range(21)]
x3 = [x+0.2 for x in range(21)]
conf_bracken_overall = []
names = ['gtdb', 'minikrakenv1', 'minikrakenv2', 'refseq']
labels = ['GTDB', 'Minikraken v1 (without human)', 'Minikraken v2 (with human)', 'RefSeq Complete v93']
sum_reduced = []
for db in range(len(conf_bracken_genus)):
new_db = pd.DataFrame(conf_bracken_genus[db].loc[overall_genus, :])
conf_bracken_overall.append(new_db)
new_db.to_csv('/Users/robynwright/Documents/OneDrive/Github/Human_metagenome/kraken2/db_genera_in_common/'+names[db]+'_common.csv')
conf_bracken_genus[db].to_csv('/Users/robynwright/Documents/OneDrive/Github/Human_metagenome/kraken2/db_genera_in_common/'+names[db]+'.csv')
sums = new_db.sum(axis=0)
sum_reduced.append(sums)
ax[db].bar(x2, sums, color=colors, edgecolor='k', width=0.4, alpha=0.5)
sums = conf_bracken_genus[db].sum(axis=0)
ax[db].bar(x1, sums, color=colors, edgecolor='k', width=0.4)
ax[db].semilogy()
ax[db].set_title(labels[db])
plt.sca(ax[db])
if db == 1 or db == 2:
plt.xticks([])
else:
plt.xticks(x3, sample_names, rotation=90)
ax[0].set_ylabel('Number of reads'), ax[1].set_ylabel('Number of reads')
handles = [Patch(facecolor='k', edgecolor='k', label='All genera'), Patch(facecolor='k', edgecolor='k', alpha=0.5, label='Genera present in all')]
ax[2].legend(handles=handles, loc='upper left', bbox_to_anchor=(1,1))
plt.tight_layout()
plt.show()
fig = plt.figure(figsize=(8,4))
ax = plt.subplot(111)
x1 = [x*5 for x in range(21)]
x2 = [x+1 for x in x1]
x3 = [x+1 for x in x2]
x4 = [x+1 for x in x3]
xplt = [x+0.5 for x in x2]
x = [x3, x1, x2, x4]
al = [1, 0.8, 0.6, 0.4]
for db in range(len(sum_reduced)):
ax.bar(x[db], sum_reduced[db], color=colors, width=1, edgecolor='k', alpha=al[db])
handles = [Patch(facecolor='k', edgecolor='k', label='Minikraken v1'), Patch(facecolor='k', edgecolor='k', alpha=0.8, label='Minikraken v2'), Patch(facecolor='k', edgecolor='k', alpha=0.6, label='GTDB'), Patch(facecolor='k', edgecolor='k', alpha=0.4, label='RefSeq Complete v93')]
ax.legend(handles=handles, loc='upper left', bbox_to_anchor=(1,1))
plt.xticks(xplt, sample_names, rotation=90)
plt.semilogy()
plt.ylabel('Number of reads')
plt.tight_layout()
plt.show()
for db in range(len(conf_bracken_overall)):
rename_samples = {}
for sample in list(conf_bracken_overall[db].columns):
rename_samples[sample] = sample+'_'+names[db]
this_db = pd.DataFrame(conf_bracken_overall[db].rename(columns=rename_samples))
if db == 0:
overall_genus = this_db
else:
overall_genus = pd.concat([overall_genus, this_db], axis=1)
overall_genus = pd.DataFrame(overall_genus)
#overall_genus = pd.DataFrame(overall_genus.divide(overall_genus.sum(axis=0)).multiply(100))
overall_genus = overall_genus[overall_genus.max(axis=1) > 25000]
sums = overall_genus.sum(axis=0)
gen_colors = get_cols(overall_genus.shape[0])
print(overall_genus)
fig = plt.figure(figsize=(14,10))
ax1 = plt.subplot(234)
ax = [ax1, plt.subplot(231, sharey=ax1), plt.subplot(232, sharey=ax1), plt.subplot(235, sharey=ax1)]
ax[1].set_ylabel('Number of reads')
ax[0].set_ylabel('Number of reads')
x1 = [x for x in range(21)]
a = 0
col_samples = list(overall_genus.columns)
#line 20
new_db = []
for name in names:
keeping = []
for s in list(overall_genus.columns):
if name in s:
keeping.append(True)
else:
keeping.append(False)
other_new_db = pd.DataFrame(overall_genus.loc[:, keeping])
new_db.append(other_new_db)
for db in range(len(new_db)):
this_genera = list(new_db[db].index.values)
handles = []
for g in range(len(this_genera)):
if g == 0:
bottom = [0 for c in x1]
these_values = new_db[db].loc[this_genera[g], :].values
ax[db].bar(x1, these_values, bottom=bottom, color=gen_colors[g], edgecolor='k', width=1)
handles.append(Patch(facecolor=gen_colors[g], edgecolor='k', label=this_genera[g]))
bottom = [bottom[x]+these_values[x] for x in range(len(bottom))]
plt.sca(ax[db])
if db == 1 or db == 2:
plt.xticks([])
else:
plt.xticks(x1, sample_names, rotation=90)
plt.semilogy()
ax[2].legend(handles=handles, loc='upper left', bbox_to_anchor=(1,1), ncol=3)
#plt.tight_layout()
plt.show()
for db in range(len(conf_bracken_overall)):
rename_samples = {}
for sample in list(conf_bracken_overall[db].columns):
rename_samples[sample] = sample+'_'+names[db]
this_db = pd.DataFrame(conf_bracken_overall[db].rename(columns=rename_samples))
if db == 0:
overall_genus = this_db
else:
overall_genus = pd.concat([overall_genus, this_db], axis=1)
overall_genus = pd.DataFrame(overall_genus)
overall_genus = pd.DataFrame(overall_genus.divide(overall_genus.sum(axis=0)).multiply(100))
overall_genus = overall_genus[overall_genus.max(axis=1) > 1]
sums = overall_genus.sum(axis=0)
gen_colors = get_cols(overall_genus.shape[0])
print(overall_genus)
fig = plt.figure(figsize=(14,10))
ax1 = plt.subplot(234)
ax = [ax1, plt.subplot(231, sharey=ax1), plt.subplot(232, sharey=ax1), plt.subplot(235, sharey=ax1)]
x1 = [x for x in range(21)]
a = 0
col_samples = list(overall_genus.columns)
#line 20
new_db = []
for name in names:
keeping = []
for s in list(overall_genus.columns):
if name in s:
keeping.append(True)
else:
keeping.append(False)
other_new_db = pd.DataFrame(overall_genus.loc[:, keeping])
new_db.append(other_new_db)
for db in range(len(new_db)):
this_genera = list(new_db[db].index.values)
handles = []
for g in range(len(this_genera)):
if g == 0:
bottom = [0 for c in x1]
these_values = new_db[db].loc[this_genera[g], :].values
ax[db].bar(x1, these_values, bottom=bottom, color=gen_colors[g], edgecolor='k', width=1)
handles.append(Patch(facecolor=gen_colors[g], edgecolor='k', label=this_genera[g]))
bottom = [bottom[x]+these_values[x] for x in range(len(bottom))]
plt.sca(ax[db])
if db == 1 or db == 2:
plt.xticks([])
else:
plt.xticks(x1, sample_names, rotation=90)
ax[2].legend(handles=handles, loc='upper left', bbox_to_anchor=(1,1), ncol=3)
plt.tight_layout()
plt.show()
Function is profiled using (using the commands given above):
1. HUMAnN2
- Uniref90 database
- Uniref50 database
2. HUMAnN3
- Uniref90 and Uniref50
- HUMAnN3 runs both Uniref50 and Uniref90 at the same time, where these needed to be run separately for HUMAnN2
plt.figure(figsize=(8,6))
ax1 = plt.subplot(111)
#unaligned_after_translation_uniref_90
#unaligned_after_translation_uniref_50
x1 = [x for x in range(21)]
x2 = [x+0.4 for x in range(21)]
x = [x+0.2 for x in range(21)]
ax1.bar(x1, reads.loc[:, 'unaligned_after_translation_uniref_90'].values, color=colors, edgecolor='k', width=0.4)
ax1.bar(x2, reads.loc[:, 'unaligned_after_translation_uniref_50'].values, color=colors, edgecolor='k', width=0.4, alpha=0.5)
plt.xticks(x, sample_names, rotation=90)
plt.ylabel('Unaligned reads after translation (%)')
plt.xlim([-0.5,21])
handles = [Patch(facecolor='k', edgecolor='k', label='Uniref90'), Patch(facecolor='k', edgecolor='k', alpha=0.5, label='Uniref50')]
ax1.legend(handles=handles, bbox_to_anchor=(1,1))
plt.tight_layout()
plt.show()
## HUMAnN2 richness {.tabset}
After running HUMAnN2, the pathabundance, pathcoverage and genefamilies tables are joined, and the pathabundance is renormalised (relative abundance calculated) and split to tables that are stratified by species or not. Here we use the unstratified pathabundance file (the MetaPhlAn2 species results don’t seem to have been particularly accurate). The genefamilies file is stratified by default so we take only the overall values for the gene family (removing all species results) and re-calculate these values as relative abundances.
def get_richness(df):
snames = list(df.columns)
num_genes = []
for sn in snames:
one_sample = pd.DataFrame(df.loc[:, sn])
one_sample = one_sample[one_sample.max(axis=1) > 0]
num_genes.append(one_sample.shape[0])
return snames, num_genes
genes_90 = pd.read_csv('/Users/robynwright/Documents/OneDrive/Github/Human_metagenome/metaphlan_humann/processing/humann2_final_out_90/humann2_genefamilies.tsv', header=0, index_col=0, sep='\t')
genes_90.drop('UNMAPPED', axis=0, inplace=True)
gene_names = list(genes_90.index.values)
keeping = []
for gn in gene_names:
new_gn = gn.split('|')[0]
if gn == new_gn: keeping.append(True)
else: keeping.append(False)
genes_90_abs = genes_90.loc[keeping, :]
genes_90 = genes_90_abs.divide(genes_90_abs.sum(axis=0)).multiply(100)
pathways_90 = pd.read_csv('/Users/robynwright/Documents/OneDrive/Github/Human_metagenome/metaphlan_humann/processing/humann2_final_out_90/humann2_pathabundance_relab_unstratified.tsv', header=0, index_col=0, sep='\t')
snames_genes_90, genes_90_richness = get_richness(genes_90)
snames_pways_90, pways_90_richness = get_richness(pathways_90)
genes_50 = pd.read_csv('/Users/robynwright/Documents/OneDrive/Github/Human_metagenome/metaphlan_humann/processing/humann2_final_out_50/humann2_genefamilies.tsv', header=0, index_col=0, sep='\t')
genes_50.drop('UNMAPPED', axis=0, inplace=True)
gene_names = list(genes_50.index.values)
keeping = []
for gn in gene_names:
new_gn = gn.split('|')[0]
if gn == new_gn: keeping.append(True)
else: keeping.append(False)
genes_50_abs = genes_50.loc[keeping, :]
genes_50 = genes_50_abs.divide(genes_50_abs.sum(axis=0)).multiply(100)
pathways_50 = pd.read_csv('/Users/robynwright/Documents/OneDrive/Github/Human_metagenome/metaphlan_humann/processing/humann2_final_out_50/humann2_pathabundance_relab_unstratified.tsv', header=0, index_col=0, sep='\t')
snames_genes_50, genes_50_richness = get_richness(genes_50)
snames_pways_50, pways_50_richness = get_richness(pathways_50)
plt.figure(figsize=(6,6))
ax1 = plt.subplot(211)
ax2 = plt.subplot(212)
x = [a for a in range(len(genes_90.columns))]
x1 = [a+0.4 for a in x]
ax1.bar(x, genes_90_richness, width=0.4, color=colors, edgecolor='k')
ax2.bar(x, genes_90_richness, width=0.4, color=colors, edgecolor='k')
ax1.bar(x1[:len(genes_50_richness)], genes_50_richness, width=0.4, color=colors, edgecolor='k', alpha=0.5)
ax2.bar(x1[:len(genes_50_richness)], genes_50_richness, width=0.4, color=colors, edgecolor='k', alpha=0.5)
plt.sca(ax1)
plt.xlim([-0.5,21]), plt.xticks([])
handles = [Patch(facecolor='k', edgecolor='k', label='Uniref90'), Patch(facecolor='k', edgecolor='k', alpha=0.5, label='Uniref50')]
plt.legend(handles=handles, bbox_to_anchor=(1,1))
plt.ylabel('Gene family\nrichness')
plt.sca(ax2)
plt.semilogy()
plt.xlim([-0.5,21])
plt.ylabel('Gene family\nrichness (log scale)')
plt.xticks(x, sample_names, rotation=90)
plt.tight_layout()
plt.show()
#54
plt.figure(figsize=(6,6))
ax1 = plt.subplot(211)
ax2 = plt.subplot(212)
x = [a for a in range(len(genes_90.columns))]
x1 = [a+0.4 for a in x]
ax1.bar(x, pways_90_richness, width=0.4, color=colors, edgecolor='k')
ax1.bar(x1, pways_50_richness, width=0.4, color=colors, edgecolor='k', alpha=0.5)
ax2.bar(x, pways_90_richness, width=0.4, color=colors, edgecolor='k')
ax2.bar(x1, pways_50_richness, width=0.4, color=colors, edgecolor='k', alpha=0.5)
plt.sca(ax1)
plt.xticks([])
plt.ylabel('Pathway richness')
handles = [Patch(facecolor='k', edgecolor='k', label='Uniref90'), Patch(facecolor='k', edgecolor='k', alpha=0.5, label='Uniref50')]
plt.legend(handles=handles, bbox_to_anchor=(1,1))
plt.xlim([-0.5,21])
plt.sca(ax2)
plt.semilogy()
plt.xlim([-0.5,21]), plt.xticks(x, sample_names, rotation=90)
plt.ylabel('Pathway richness (log scale)')
plt.tight_layout()
plt.show()
Now we are looking at the gene families and pathways that are differentially abundant between body sites, using the lists that are output by HUMAnN2.
Volcano plot showing the number of differentially abundant pathways between body sites. Colored points denote genes that are significantly differentially abundant (T-test p<0.05 and log2 (median) FC > 2 or < -2) and they are colored by which body site they are highest in abundance in. Note that very large fold changes are not shown. Initial results showed that there were very few genes with > 10 or < -10, so the maximum plotted is this.
def get_diff_abundant(genes, name_csv):
col_names = list(genes.columns)
col_names_dict = {}
for cn in col_names:
if 'SRR8595488' in cn:
genes.drop([cn], axis=1, inplace=True)
cn_new = cn.split('_')[0]
col_names_dict[cn] = site_dict[cn_new]
new_genes = genes.rename(columns=col_names_dict)
comparisons = [['Saliva', 'Blood'], ['Buccal', 'Blood'], ['Saliva', 'Buccal'], ['Saliva', 'Saliva_euk'], ['Buccal', 'Buccal_euk'], ['Blood', 'Saliva_euk'], ['Blood', 'Buccal_euk']]
fig = plt.figure(figsize=(15,8))
ax = [plt.subplot2grid((2,8), (0,1), colspan=2), plt.subplot2grid((2,8), (0,3), colspan=2), plt.subplot2grid((2,8), (0,5), colspan=2), plt.subplot2grid((2,8), (1,0), colspan=2), plt.subplot2grid((2,8), (1,2), colspan=2), plt.subplot2grid((2,8), (1,4), colspan=2), plt.subplot2grid((2,8), (1,6), colspan=2)]
for a in range(len(comparisons)):
ax[a].set_title(comparisons[a][0]+r' $vs$ '+comparisons[a][1])
this_comparison = new_genes.loc[:, comparisons[a]]
this_comparison = this_comparison[this_comparison.max(axis=1) > 0]
genes_comp = list(this_comparison.index.values)
fc, pval, colors_p, this_fc = [], [], [], []
for b in range(len(genes_comp)):
c1, c2 = list(this_comparison.loc[genes_comp[b], comparisons[a][0]]), list(this_comparison.loc[genes_comp[b], comparisons[a][1]])
c1, c2 = [0.000001 if x == 0 else x for x in c1], [0.000001 if x == 0 else x for x in c2]
c1, c2 = [math.log2(c1[c]) for c in range(len(c1))], [math.log2(c2[c]) for c in range(len(c2))]
t, p = stats.ttest_ind(c1, c2)
c1, c2 = np.median(c1), np.median(c2)
diff = c1/c2
if diff < 1 and diff > 0: diff = -(1/diff)
fc.append(diff), pval.append(p)
this_fc.append([genes_comp[b], diff, p])
colors_p_all = [colors_dict[comparisons[a][0]], colors_dict[comparisons[a][1]]]
for c in range(len(fc)):
if fc[c] >= 2 and pval[c] <= 0.05: colors_p.append(colors_p_all[1])
elif fc[c] <= -2 and pval[c] <= 0.05: colors_p.append(colors_p_all[0])
else: colors_p.append('#B1B1B1')
pval[c] = -math.log10(pval[c])
com = comparisons[a][0]+'_'+comparisons[a][1]
if a == 0:
df_pval = pd.DataFrame(this_fc, columns=['Genes/pathways', com+'_FC', com+'_p'])
df_pval.set_index('Genes/pathways', inplace=True)
else:
new_df = pd.DataFrame(this_fc, columns=['Genes/pathways', com+'_FC', com+'_p'])
new_df.set_index('Genes/pathways', inplace=True)
df_pval = pd.concat([df_pval, new_df], axis=1, join='outer')
ma = max([max(fc), abs(min(fc))])+0.5
if ma > 10: ma = 10.5
ax[a].set_xlim([-ma, ma])
ax[a].scatter(fc, pval, marker='o', c=colors_p, s=10)
ax[a].set_xlabel(r'Log$_2$ fold change')
if a == 0 or a == 3:
ax[a].set_ylabel('-log(p value)')
plt.sca(ax[a])
df_pval.to_csv('/Users/robynwright/Documents/OneDrive/Github/Human_metagenome/metaphlan_humann/processing/'+name_csv+'.csv')
handles = [Patch(facecolor=colors_dict[color], edgecolor='k', label=color) for color in colors_dict]
ax[2].legend(handles=handles, bbox_to_anchor=(1.04,1), loc='upper left')
return
pways = pd.DataFrame(pathways_90)
get_diff_abundant(pways, 'pathways_90')
plt.subplots_adjust(hspace=0.5, wspace=0.5)
plt.show()
Volcano plot showing the number of differentially abundant pathways between body sites. Colored points denote genes that are significantly differentially abundant (T-test p<0.05 and log2 (median) FC > 2 or < -2) and they are colored by which body site they are highest in abundance in. Note that very large fold changes are not shown. Initial results showed that there were very few genes with > 10 or < -10, so the maximum plotted is this.
pways = pd.DataFrame(pathways_50)
get_diff_abundant(pways, 'pathways_50')
plt.subplots_adjust(hspace=0.5, wspace=0.5)
plt.show()
Volcano plot showing the number of differentially abundant pathways between body sites. Colored points denote genes that are significantly differentially abundant (T-test p<0.05 and log2 (median) FC > 2 or < -2) and they are colored by which body site they are highest in abundance in. Note that very large fold changes are not shown. Initial results showed that there were very few genes with > 10 or < -10, so the maximum plotted is this.
pways = pd.DataFrame(genes_90)
get_diff_abundant(pways, 'genes_90')
plt.subplots_adjust(hspace=0.5, wspace=0.5)
plt.show()
Volcano plot showing the number of differentially abundant pathways between body sites. Colored points denote genes that are significantly differentially abundant (T-test p<0.05 and log2 (median) FC > 2 or < -2) and they are colored by which body site they are highest in abundance in. Note that very large fold changes are not shown. Initial results showed that there were very few genes with > 10 or < -10, so the maximum plotted is this.
pways = pd.DataFrame(genes_50)
get_diff_abundant(pways, 'genes_50')
plt.subplots_adjust(hspace=0.5, wspace=0.5)
plt.show()